In this data visualization, we will be looking only at homes with the homeowner’s exemption that maintained that exemption from 2007-2016. This means homes that are sold from one primary resident to another. The “clean_data.rmd” notebook contains all the code for cleaning, subsetting, and preparing the data.
Loading libraries and the data:
library(tidyverse)
library(ggplot2)
library(scales)
library(plotly)
# Read the assessor data subset from its RDS file into `dat`.
dat <- read_rds("./data/compressed_assessors_data_subset.rds")
First let’s look at the data from the most recent year. The total taxable assessments are plotted below, first with a linear x axis and then with a log-scaled x axis, because of the long tail of very high taxable assessments. The assessments range from ~$7,500 to ~$14,000,000 with a mean of ~$480,000 and a median of ~$360,000.
# Keep only the 2016 roll year, then print rounded summary statistics of the
# taxable assessments for that year.
currdat <- dat %>%
  filter(`Closed Roll Year` == 2016)
round(summary(currdat[["Total Taxable Assessment"]]))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7586 154442 359064 485878 648759 14313114
# Histogram of the 2016 taxable assessments on a linear x axis
# (bins are 1000 units wide).
p <- ggplot(currdat, aes(x = `Total Taxable Assessment`)) +
  geom_histogram(binwidth = 1000)
p
# Same histogram on a natural-log x axis; the bin width is applied on the
# transformed (log) scale, hence the much smaller value.
p <- ggplot(currdat, aes(x = `Total Taxable Assessment`)) +
  geom_histogram(binwidth = 0.01) +
  scale_x_continuous(trans = "log")
p
Here we can see how these numbers have changed over time.
# Per-year summary statistics of the taxable assessment, stacked in long form:
# one row per roll year and statistic, with the statistic's name in the
# `statistic` column.
stat_funs <- list(Mean = mean, Median = median, Min = min, Max = max)
summary_tta <- bind_rows(lapply(names(stat_funs), function(stat_name) {
  dat %>%
    group_by(`Closed Roll Year`) %>%
    summarise(
      `Total Taxable Assessment` =
        stat_funs[[stat_name]](`Total Taxable Assessment`, na.rm = TRUE)
    ) %>%
    mutate(statistic = stat_name)
}))

# All four statistics on one log-scaled y axis. The default log breaks are
# used on purpose: breaks derived from pretty() over a 0..max sequence are
# evenly spaced in data units, include an invalid 0, and leave the lower
# lines (min, median, mean) without any tick labels on a log axis.
p <- ggplot(
  summary_tta,
  aes(
    x = `Closed Roll Year`,
    y = `Total Taxable Assessment`,
    group = statistic,
    color = statistic
  )
) +
  geom_line() +
  ggtitle("Total Taxable Assessments over Time") +
  scale_y_log10(labels = scales::comma)
p
# Line plot of a single statistic from `summary_tta` over the roll years.
#   stat_name:  value of the `statistic` column to keep.
#   plot_title: title shown above the plot.
plot_statistic <- function(stat_name, plot_title) {
  summary_tta %>%
    filter(statistic == stat_name) %>%
    ggplot(aes(x = `Closed Roll Year`, y = `Total Taxable Assessment`)) +
    geom_line() +
    ggtitle(plot_title)
}

# Mean and median share a fixed y range so the two plots are comparable.
p <- plot_statistic("Mean", "Mean Total Taxable Assessment over Time") +
  ylim(2e+05, 5e+05)
p
p <- plot_statistic("Median", "Median Total Taxable Assessment over Time") +
  ylim(2e+05, 5e+05)
p
# Max and min keep the automatic y range.
p <- plot_statistic("Max", "Max Total Taxable Assessment over Time")
p
p <- plot_statistic("Min", "Min Total Taxable Assessment over Time")
p
We can see the trends in these statistics by fitting each with a linear model. The maximum assessment increases by an average of ~$175,000 each year. The minimum assessment slope is calculated at $60 each year, but this number is skewed by the very low minimum in 2014; excluding that year gets us an average of ~$180 per year. The mean assessment and the median assessment increase at ~$10,000 and ~$6,000 per year, respectively. A next step would be to separate the data further into neighborhoods, and perhaps houses that are sold each year, since it is those larger jumps in value that would be useful to predict for home sellers.
# Slope (second coefficient) of a simple linear regression of `y` on `x`.
yearly_slope <- function(y, x) {
  coef(lm(y ~ x))[[2]]
}

# Yearly trend of each summary statistic.
summary_tta %>%
  group_by(statistic) %>%
  summarise(
    `Slope Total Assessment` =
      yearly_slope(`Total Taxable Assessment`, `Closed Roll Year`)
  )

# Trend of the minimum alone, refitted without the 2014 roll year.
summary_tta %>%
  filter(statistic == "Min", `Closed Roll Year` != 2014) %>%
  summarise(
    `Slope Total Assessment` =
      yearly_slope(`Total Taxable Assessment`, `Closed Roll Year`)
  )
Below are plotted the earliest and latest years on record, to estimate the dates the property was built and last sold. There definitely seems to be an issue with the dates the properties were last sold, as there are large peaks every 10 years, and apparently a lot of missing data in the early 2000s.
# One-year-wide histogram of `Earliest Year` for the 2016 rows, skipping
# missing values and limiting the x axis to 1900-2016.
p <- currdat %>%
  filter(!is.na(`Earliest Year`)) %>%
  ggplot(aes(x = as.numeric(`Earliest Year`))) +
  geom_histogram(binwidth = 1) +
  xlim(1900, 2016)
p
# Same view for `Latest Year`.
p <- currdat %>%
  filter(!is.na(`Latest Year`)) %>%
  ggplot(aes(x = as.numeric(`Latest Year`))) +
  geom_histogram(binwidth = 1) +
  xlim(1900, 2016)
p
Here we can look at the current total taxable assessments for different neighborhoods in San Francisco in 2016, as well as the average change in total taxable assessments per year. I have plotted each on both a linear and a log scale for easier comparison of the medians of the neighborhoods.
# Theme tweak shared by all four boxplots: turn the neighborhood names on the
# x axis vertical so they do not overlap.
rotated_x_labels <- theme(
  axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
)

# Fresh natural-log y scale whose tick labels are rounded to whole numbers.
log_y_scale <- function() {
  scale_y_continuous(
    trans = "log",
    labels = scales::number_format(accuracy = 1)
  )
}

# 2016 taxable assessment per neighborhood, linear y axis.
p <- ggplot(
  currdat,
  aes(x = `Analysis Neighborhood`, y = `Total Taxable Assessment`)
) +
  geom_boxplot() +
  ggtitle("Total Taxable Assessment in 2016") +
  rotated_x_labels
p

# 2016 taxable assessment per neighborhood, log y axis.
p <- ggplot(
  currdat,
  aes(x = `Analysis Neighborhood`, y = `Total Taxable Assessment`)
) +
  geom_boxplot() +
  ggtitle("Total Taxable Assessment in 2016") +
  log_y_scale() +
  rotated_x_labels
p

# Per-parcel yearly change per neighborhood, linear y axis.
p <- ggplot(
  currdat,
  aes(x = `Analysis Neighborhood`, y = `Slope Total Assessment`)
) +
  geom_boxplot() +
  ggtitle("Average Changes in Total Taxable Assessments per Year") +
  rotated_x_labels
p

# Per-parcel yearly change per neighborhood, log y axis.
# NOTE(review): a log axis cannot show zero or negative values; if
# `Slope Total Assessment` contains any, those rows are left out of this
# plot -- confirm that is acceptable.
p <- ggplot(
  currdat,
  aes(x = `Analysis Neighborhood`, y = `Slope Total Assessment`)
) +
  geom_boxplot() +
  ggtitle("Average Changes in Total Taxable Assessments per Year") +
  log_y_scale() +
  rotated_x_labels
p
Here we’ll look in more detail at how the total assessments have changed in different neighborhoods over the years, for properties that are listed as having sold since 2006 and those that are not listed as having sold.
# Number of 2016 rows per neighborhood.
neighborhoods_count <- currdat %>%
  group_by(`Analysis Neighborhood`) %>%
  summarise(count = n())
# Neighborhoods with at least 1000 rows in 2016, as a plain vector.
has_enough_rows <- neighborhoods_count$count >= 1000
neighborhoods <- unique(
  neighborhoods_count$`Analysis Neighborhood`[has_enough_rows]
)
# Plot the assessment history of every parcel in one neighborhood.
#
# Draws one line per `Parcel Number` (taxable assessment against roll year),
# colored by `Earliest Year`, and splits the plot into panels by whether
# `Latest Year` is at or after `sold_since`.
#
# Args:
#   nhood:      value of `Analysis Neighborhood` to plot; also the plot title.
#   data:       data frame holding all roll years; defaults to the global
#               `dat`, so existing one-argument calls keep working.
#   sold_since: cutoff year for the panel split; defaults to 2006.
#
# Returns:
#   A ggplot object (not printed here).
pAssessTime <- function(nhood, data = dat, sold_since = 2006) {
  nhood_dat <- data %>%
    filter(`Analysis Neighborhood` == nhood)

  # Descriptive strip labels instead of bare FALSE / TRUE.
  panel_labels <- c(
    paste("Not listed as sold since", sold_since),
    paste("Listed as sold since", sold_since)
  )

  # An earlier variant split the panels on the year of `Current Sales Date`
  # instead of `Latest Year`.
  ggplot(
    nhood_dat,
    aes(
      x = `Closed Roll Year`,
      y = `Total Taxable Assessment`,
      group = `Parcel Number`,
      color = `Earliest Year`
    )
  ) +
    geom_line() +
    scale_color_distiller(palette = "Spectral") +
    # as.character() keeps the title textual even if nhood is a factor.
    ggtitle(as.character(nhood)) +
    theme_dark() +
    facet_wrap(
      ~ factor(
        `Latest Year` >= sold_since,
        levels = c(FALSE, TRUE),
        labels = panel_labels
      )
    )
}
# Build (and, at top level, print) one plot per selected neighborhood.
lapply(X = neighborhoods, FUN = pAssessTime)
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]